Loading the packages

library(tidyverse)
library(tidymodels)
library(skimr)
library(MESS)
library(GGally)
library(tableone)
library(blandr)
library(ggcorrplot)
#library(colino) # en developpement
#devtools::install_github("stevenpawley/recipeselectors")
# library(recipeselectors)
tidymodels_prefer()
library(DALEXtra)
library(DataExplorer)
library(naniar)
library(finetune)
library(stacks)
library(vip)

Loading of the data

summary of data

skim(isavuco)
Data summary
Name isavuco
Number of rows 193752
Number of columns 7
_______________________
Column type frequency:
factor 2
numeric 5
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
id 0 1 FALSE 897 1_1: 216, 1_1: 216, 1_5: 216, 10_: 216
dose_group 0 1 FALSE 3 10m: 64584, 15m: 64584, 5mg: 64584

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
time 0 1 108.50 62.35 1.00 54.75 108.50 162.25 216.00 ▇▇▇▇▇
out 0 1 6.05 5.09 0.01 2.68 4.75 7.76 53.59 ▇▁▁▁▁
weight 0 1 47.64 9.37 15.39 41.79 47.75 53.70 77.23 ▁▃▇▅▁
age 0 1 10.06 2.67 2.79 8.27 10.05 11.88 16.67 ▁▃▇▅▂
dose 0 1 476.39 219.25 76.97 267.21 469.45 637.09 1158.50 ▇▇▇▃▁

plot of conc

# Spaghetti plot of the simulated concentration-time profiles.
# Fix: the x-axis label contained a stray leading apostrophe ("'Time (h)").
isavuco %>%
  ggplot(aes(x = time, y = out)) +
  geom_point(alpha = 0.1) +
  labs(
    title = "Individual simulated isavuconazole profiles",
    x = "Time (h)",
    y = "Isavuconazole concentrations (mg/L)"
  ) +
  theme_bw()

Calculation of reference AUC to be predicted

AUC from 192 to 216 h use of the MESS:auc function

# extraction of time interval

# Trapezoidal AUC over the 192-216 h interval (steady state), computed per
# subject with MESS::auc(); the value is repeated on every row of the
# interval for each id. The result stays grouped by id (used downstream).
trap1 <- isavuco %>%
  filter(time >= 192, time <= 216) %>%
  group_by(id) %>%
  mutate(auc_024 = auc(time, out))

## plot of AUC ref

# Distribution of the reference AUC (one row per subject via time == 192).
# Fix: the x-axis label contained a stray leading apostrophe ("'AUC ...").
trap1 %>%
  filter(time == 192) %>%
  ggplot(aes(x = auc_024)) +
  geom_histogram() +
  labs(
    title = "Distribution of the simulated AUC at steady state",
    x = "AUC isavuconazole"
  ) +
  theme_bw() +
  scale_x_continuous(trans = scales::pseudo_log_trans(base = 10))

# ggsave("Distribution_auc_sim_191220.tiff", dpi = 100)

We have to remove extreme AUC values ("outliers"); let's remove 1% of the observations at each of the two extremes.

# Empirical 1st, 50th and 99th percentiles of the reference AUC, computed on
# one row per subject (time == 192).
table_quant <- trap1 %>%
  filter(time == 192) %>%
  ungroup() %>%
  reframe(
    quantiles = quantile(auc_024, c(0.01, 0.5, 0.99)),
    q = c(0.01, 0.5, 0.99),
    n = n()
  )

# Trimming bounds: drop the 1% most extreme AUC values on each side.
quantile_1 <- table_quant[[1, 1]]
quantile_99 <- table_quant[[3, 1]]

Feature engineering

Extraction of concentrations used as predictor and creation of binned column We will try to predict from concentrations at 48h (trough) to 60h and make selection of important features based on boruta algorithm

# Keep the predictor sampling window (48-60 h) and label each retained hour
# with a bin name: conc_0 = 48 h (trough) up to conc_6 = 54 h. Hours 55-60
# (conc_7..conc_12) were considered but are currently disabled.
# Fix: use NA_character_ instead of bare NA so the case_when() result is
# unambiguously character on all dplyr versions.
isavuco_pred <- isavuco %>%
  filter(between(time, 48, 60)) %>%
  mutate(
    ## binned column for time
    time_bin = case_when(
      time == 48 ~ "conc_0",
      time == 49 ~ "conc_1",
      time == 50 ~ "conc_2",
      time == 51 ~ "conc_3",
      time == 52 ~ "conc_4",
      time == 53 ~ "conc_5",
      time == 54 ~ "conc_6",
      TRUE ~ NA_character_
    )
  ) %>%
  # drop the un-binned times (55-60 h)
  filter(!is.na(time_bin))
## calculation of relative difference between theoretical time and observed time 
# diff_rel_time = case_when(
#           time==0 ~ 0,
#          between(time,0.85,1.15) ~ (time - 1)/1,
#          between(time,2.8,3.2) ~ (time - 3)/3,
#          TRUE~100)) %>% filter(diff_rel_time<50) %>% mutate_if(is.character,factor)
# 

## filter patient with exactly 2 rows for the 2 times
# isavuco_pred %>% group_by(id) %>% filter(n() == 2) %>% ungroup() 

Preparation of the file for tidymodels

  1. Merge with outcome
  2. Pivot column to have only one row per patient
# pivot conc
# Pivot to one row per subject (one column per binned concentration), then
# re-attach the time-invariant covariates.
# Fixes: T -> TRUE (T is reassignable), and an explicit by = "id" so the
# join key no longer depends on which columns happen to be shared.
isavuco_pivot <- isavuco_pred %>%
  pivot_wider(id_cols = id, names_from = time_bin, values_from = out) %>%
  left_join(
    isavuco_pred %>%
      dplyr::select(id, dose_group:dose) %>%
      distinct(id, .keep_all = TRUE),  # one covariate row per subject
    by = "id"
  )


# merge with AUC
# Attach the reference AUC to the one-row-per-subject predictor table and
# trim the 1% most extreme AUC values on each side.
# NOTE(review): left_join(trap1) joins on every shared column (id plus the
# covariates) and fans out to the ~25 interval rows per subject; the
# trailing select()/distinct() collapses it back to one row per subject.
# NOTE(review): distinct(conc_0, ...) dedupes on conc_0 GLOBALLY (across
# subjects), not per id — this works only if simulated conc_0 values are
# effectively unique per subject; confirm that assumption.
isavuco_ml <- isavuco_pivot %>% 
  left_join(trap1) %>% 
  #remove extreme AUC values
  filter(between(auc_024, quantile_1, quantile_99)) %>% 
  select(-out, -time) %>% 
  distinct(conc_0, .keep_all = TRUE)

variable selection

Preparation of the data for variable selection

# pivot conc
# One-row-per-subject table with ALL candidate predictors (for feature
# selection): binned concentrations plus every original column.
# Fixes: T -> TRUE, and select(id, auc_024) made explicit — the original
# select(auc_024) only worked because grouped select() silently re-adds the
# grouping variable id.
isavuco_pivot_all <- isavuco_pred %>%
  pivot_wider(id_cols = id, names_from = time_bin, values_from = out) %>%
  ## re-attach the non-pivoted variables (joined on id, the only shared column)
  left_join(isavuco_pred) %>%
  distinct(id, .keep_all = TRUE) %>%
  # join fans out to the interval rows per subject; collapsed again below
  left_join(trap1 %>% select(id, auc_024)) %>%
  filter(between(auc_024, quantile_1, quantile_99)) %>%
  select(-time, -time_bin) %>%
  # NOTE(review): distinct on conc_0 is global, not per subject — valid only
  # if conc_0 is unique per id; confirm.
  distinct(conc_0, .keep_all = TRUE)

Random forest variable-importance (VIP) plots

# Quick permutation-importance screen with a 500-tree random forest
# (id and the raw concentration column are excluded from the predictors).
rf_model <- rand_forest(trees = 500, mode = "regression") %>%
  set_engine("ranger", importance = "permutation")

rf_fit <- fit(
  rf_model,
  auc_024 ~ .,
  data = dplyr::select(isavuco_pivot_all, -id, -out)
)

# plot feature importance
vip(rf_fit)

Boruta

https://towardsdatascience.com/boruta-explained-the-way-i-wish-someone-explained-it-to-me-4489d70e154a

library(Boruta)

# Boruta all-relevant feature selection against the reference AUC.
# NOTE(review): unlike the random-forest fit above, id and out are NOT
# excluded here (and `out` shows up in the selected features) — confirm
# this is intended.
boruta_model <- Boruta(auc_024 ~ ., data = isavuco_pivot_all, doTrace = 2)
# keep tentative attributes as well as confirmed ones
final_features <- getSelectedAttributes(boruta_model, withTentative = TRUE)

print(final_features)  # list of selected features
##  [1] "conc_0"     "conc_1"     "conc_2"     "conc_3"     "conc_4"    
##  [6] "conc_5"     "conc_6"     "out"        "dose_group" "weight"    
## [11] "age"        "dose"

recursive feature elimination

library(caret)

# Recursive feature elimination with a random-forest wrapper, 5-fold CV.
control <- rfeControl(functions = rfFuncs, method = "cv", number = 5)

rfe_model <- rfe(
  x = isavuco_pivot_all %>% select(-id, -auc_024, -dose_group),
  y = isavuco_pivot_all$auc_024,
  sizes = c(2, 3, 4, 5, 10),
  rfeControl = control
)

# selected features
print(rfe_model)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (5 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          2 46.65   0.6303 32.51  4.084    0.06077 2.860         
##          3 36.26   0.7776 23.42  2.823    0.03853 2.209         
##          4 35.33   0.7894 22.45  2.948    0.04005 2.124         
##          5 35.16   0.7919 22.50  2.662    0.03594 1.854         
##         10 34.74   0.7955 21.59  2.889    0.04063 1.912        *
##         11 34.98   0.7936 22.04  2.723    0.03770 1.842         
## 
## The top 5 variables (out of 10):
##    dose, conc_1, conc_6, out, conc_0

creation of informative predictor

Calculation of differences between the remaining concentrations)

# Derived predictors: absolute and relative change between the two retained
# concentrations (conc_1 vs conc_0); other conc_* columns are dropped.
isavuco_ml <- isavuco_ml %>%
  mutate(
    diff_conc = conc_1 - conc_0,
    ratio_conc = conc_1 / conc_0
  ) %>%
  select(id, conc_0, conc_1, dose_group:auc_024, diff_conc, ratio_conc)

graphical exploration

# Pairwise plots of all predictors and the outcome, coloured by dose group.
ggp <- isavuco_ml %>%
  select(-id) %>%
  ggpairs(aes(color = dose_group))
print(ggp, progress = FALSE)

# Marginal distributions of the key continuous variables.
ggplot(isavuco_ml, aes(x = conc_0)) + geom_histogram()

ggplot(isavuco_ml, aes(x = conc_1)) + geom_histogram()

ggplot(isavuco_ml, aes(x = dose)) + geom_histogram()

ggplot(isavuco_ml, aes(x = diff_conc)) + geom_histogram()

ggplot(isavuco_ml, aes(x = ratio_conc)) + geom_histogram()

Correlation matrix for continuous data only

##Correlation Analysis
  cor_data <- cor(isavuco_ml %>% select_if(is.numeric), use = "complete.obs")

  
# plots
ggcorrplot(cor_data, hc.order = TRUE, type = "lower",
           lab = FALSE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

Graphical exploration: histograms with the DataExplorer package

# continuous variables: raw histograms
plot_histogram(isavuco_ml)

# same, on a log10 x scale with a plain theme
plot_histogram(isavuco_ml, scale_x = "log10", ggtheme = theme_bw())

Visualisation of missing data

# Missingness map; small fonts keep the variable names readable.
vis_miss(isavuco_ml) +
  theme(
    axis.text = element_text(size = 3),
    axis.title = element_text(size = 6)
  )

We will log transform concentrations, amount, and diff and ratio of concentrations

data splitting

Function usefull for longitudinal data (PK data)
group_initial_split(Orthodont, group = Subject, prop = 2 / 3) ; this is not the case here
set.seed(1234)
# 75/25 train/test split, stratified on the outcome so both sets keep a
# similar AUC distribution. (group_initial_split() would be required for
# longitudinal data with several rows per subject — not the case here.)
isavuco_split <- initial_split(isavuco_ml, strata = auc_024, prop = 3 / 4)
isavuco_ml_train <- training(isavuco_split)
isavuco_ml_test <- testing(isavuco_split)

# To put 60% into training, 20% in validation, and 20% in testing:
#isavuco_val_split <- initial_validation_split(isavuco_ml, prop = c(0.6, 0.2))
# isavuco_ml_train <- training(isavuco_val_split)
# isavuco_ml_test <- testing(isavuco_val_split)
# isavuco_ml_val <- validation(isavuco_val_split)

description dataset train and test

# Stack the train ("dev") and test ("val") sets with a split label so that
# tableone can compare their distributions.
isavuco_dev <- isavuco_ml_train %>% mutate(type = "dev")
isavuco_val <- isavuco_ml_test %>% mutate(type = "val")
isavuco_des <- full_join(isavuco_dev, isavuco_val)

# variable names: dput(names(isavuco_ml_train))
## categorical variables needing transformation
catVars <- c("dose_group")
## variables summarised in the table
vars <- c("conc_0", "conc_1", "dose_group", "weight", "age", "dose",
          "auc_024", "diff_conc")
## variables reported as median [range] / median [IQR]
nonnormal_vars <- c("conc_0", "conc_1", "weight", "age", "dose",
                    "auc_024", "diff_conc", "ratio_conc")

tableOne <- CreateTableOne(vars = vars, strata = "type",
                           factorVars = catVars, data = isavuco_des)
# median [min, max] version
tableOne2 <- print(tableOne, nonnormal = nonnormal_vars,
                   printToggle = FALSE, minMax = TRUE)
# median [IQR] version
tableOne2b <- print(tableOne, nonnormal = nonnormal_vars,
                    printToggle = FALSE, minMax = FALSE)
dev val p test
n 659 220
conc_0 (median [range]) 6.03 [0.24, 36.96] 6.42 [0.49, 36.03] 0.426 nonnorm
conc_1 (median [range]) 8.31 [0.26, 49.35] 9.00 [0.52, 52.43] 0.342 nonnorm
dose_group (%) 0.828
10mg/kg 225 (34.1) 72 (32.7)
15mg/kg 213 (32.3) 76 (34.5)
5mg/kg 221 (33.5) 72 (32.7)
weight (median [range]) 47.78 [15.39, 77.23] 47.48 [15.39, 66.45] 0.795 nonnorm
age (median [range]) 10.18 [2.79, 16.67] 9.75 [2.79, 15.96] 0.199 nonnorm
dose (median [range]) 466.55 [76.97, 1158.50] 473.60 [134.63, 974.64] 0.775 nonnorm
auc_024 (median [range]) 113.49 [16.02, 389.65] 114.37 [23.20, 399.09] 0.939 nonnorm
diff_conc (median [range]) 2.06 [0.02, 19.51] 2.23 [0.03, 18.68] 0.463 nonnorm
dev val p test
n 659 220
conc_0 (median [IQR]) 6.03 [3.47, 10.11] 6.42 [3.29, 10.76] 0.426 nonnorm
conc_1 (median [IQR]) 8.31 [5.07, 14.45] 9.00 [4.88, 14.76] 0.342 nonnorm
dose_group (%) 0.828
10mg/kg 225 (34.1) 72 (32.7)
15mg/kg 213 (32.3) 76 (34.5)
5mg/kg 221 (33.5) 72 (32.7)
weight (median [IQR]) 47.78 [41.52, 54.14] 47.48 [42.22, 52.97] 0.795 nonnorm
age (median [IQR]) 10.18 [8.40, 11.88] 9.75 [8.19, 11.40] 0.199 nonnorm
dose (median [IQR]) 466.55 [267.18, 633.88] 473.60 [267.59, 637.07] 0.775 nonnorm
auc_024 (median [IQR]) 113.49 [71.38, 170.20] 114.37 [72.73, 168.70] 0.939 nonnorm
diff_conc (median [IQR]) 2.06 [1.04, 4.44] 2.23 [0.99, 4.50] 0.463 nonnorm

Resampling object

## folds used for hyperparameter tuning
set.seed(2345)
# vfold_cv defaults: v = 10 folds, one repeat (use repeats = to change);
# strata keeps the outcome distribution similar across folds
folds <- vfold_cv(isavuco_ml_train, strata = auc_024)

## separate folds for the final cross-validated performance estimates
set.seed(456)
folds_cv <- vfold_cv(isavuco_ml_train, strata = auc_024)  # 10 folds by default

pre processing

https://recipes.tidymodels.org/reference/index.html

Normalisations usuelles à appliquer par algortihme utilisés https://www.tmwr.org/pre-proc-table.html

# Pre-processing recipe shared by every model:
#  - id kept only as an identifier (not a predictor),
#  - dose_group removed (categorical; dose itself is kept),
#  - zero-variance predictors dropped,
#  - concentration columns log-transformed (small offset guards against 0),
#  - dose Yeo-Johnson transformed, then all numeric predictors centred/scaled.
# For categorical covariates, one-hot encoding (step_dummy) slows down
# rf/xgb — keep factors for those models; collapse rare levels with step_other.
isavuco_ml_rec <- recipe(auc_024 ~ ., data = isavuco_ml_train) %>%
  update_role(id, new_role = "id") %>%
  step_rm(dose_group) %>%
  step_zv(all_predictors()) %>%
  step_log(contains("conc"), offset = 0.0001) %>%
  step_YeoJohnson(dose) %>%
  step_normalize(all_numeric_predictors())

# Materialise the processed train/test sets for inspection.
isavuco_ml_rec_prep <- prep(isavuco_ml_rec)
isavuco_train_recipe <- bake(isavuco_ml_rec_prep, new_data = NULL)
isavuco_test_recipe <- bake(isavuco_ml_rec_prep, new_data = isavuco_ml_test)

Encoding different form one hot in the case of a variable with many different factors. We can replace by the mean outcome value for a given factor (regression) or the probability of outcome (classification). In the case of a level with a small sampling size or only 1 value, there is more uncertainty and we can use partial pooling with mixed models : shrink values toward the mean. With mixed effect models, the effects for each level are modeled all at once using a mixed generalized linear model

Example non run of encoding a factor using the mean of the outcome

library(embed)
# step_lencode_glm(factor_variable, outcome = vars(outcome)) %>%
#step_lencode_mixed(factor_variable , outcome = vars(outcome)) %>%
# need to be tuned to prevent overfitting
#interesting because can manage new categories


# to see the encoded values
# glm_estimates <-
#   prep(name_of_recipe) %>%
#   tidy(number = 2)# 2 being the position of the step in the recipe 

model & workflow Xgboost

https://parsnip.tidymodels.org/reference/boost_tree.html

Parameters to tune

# Boosted-tree model: every major hyperparameter is marked for tuning.
xgb_spec <- boost_tree(
  mode = "regression",
  mtry = tune(),
  trees = tune(),
  min_n = tune(),
  sample_size = tune(),
  tree_depth = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost")

# workflow = recipe + model
xgb_wf <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(xgb_spec)

By default, hyperparameter values are prespecified and optimised automatically; however, it is possible to change them. For example, the maximum of mtry is calculated automatically when fitting, from the number of features (sqrt(param)).

# xgb_wf %>% extract_parameter_dials("mtry")
# 
# # update value
# xgb_param <- extract_parameter_set_dials(xgb_wf) %>% 
#   update(mtry = mtry(c(2, 7)))

Example of grid for tuning

Space-filling designs can be very effective at representing the parameter space. The default design used by the tune package is the maximum entropy design. These tend to produce grids that cover the candidate space well and drastically increase the chances of finding good results.

# # reguliere
# grid_regular(xgb_param, levels = 2)
# 
# # irreguliere
# set.seed(1301)
# xgb_param %>% 
# grid_latin_hypercube(size = 20) %>% # 'size' is the number of combinations
#   summary()

Tuning the hyperparameters

With parallel processing

https://www.tidymodels.org/start/tuning/ https://www.tidymodels.org/learn/work/bayes-opt/

library(doParallel)

# Step 1: set up the parallel backend.
# Fix: parallel::detectCores() can return NA on some platforms; guard with
# max(..., na.rm = TRUE) so we always get at least 1 worker, and leave one
# core free for the system.
num_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(num_cores)
registerDoParallel(cl)

message("Parallel backend set up with ", num_cores, " cores.")

# Step 2: grid search over 60 candidate hyperparameter combinations.
set.seed(234)  # reproducibility of the random grid

tune_xgb <- tune_grid(
  xgb_wf,              # workflow object
  resamples = folds,   # cross-validation folds
  grid = 60,           # number of tuning parameter combinations
  control = control_grid(
    verbose = TRUE,                # display progress
    allow_par = TRUE,              # enable parallel processing
    parallel_over = "everything",  # parallelise across folds AND grid points
    save_pred = TRUE,
    save_workflow = TRUE
  )
)

# Step 3: always release the workers, then revert to sequential execution.
stopCluster(cl)
registerDoSEQ()
message("Parallel backend stopped and reverted to sequential execution.")

#View results resultats

# RMSE across the tried hyperparameter combinations.
autoplot(tune_xgb, metric = "rmse", scientific = FALSE) +
  theme_bw() +
  ggtitle("tuning hyperparameter")

Example of tuning with finetune

Race approach: here, the tuning process evaluates all models on an initial subset of resamples. Based on their current performance metrics, some parameter sets are not considered in subsequent resamples.

library(finetune)

# ANOVA racing: clearly inferior hyperparameter candidates are eliminated
# after the first few resamples instead of being evaluated on every fold.
set.seed(234)
tune_xgb_ft <- tune_race_anova(
  xgb_wf,
  folds,
  grid = 60,
  control = control_race(verbose_elim = TRUE)
)

tune_xgb_ft
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits           id     .order .metrics            .notes          
##    <list>           <chr>   <int> <list>              <list>          
##  1 <split [591/68]> Fold03      1 <tibble [120 × 10]> <tibble [0 × 3]>
##  2 <split [592/67]> Fold05      2 <tibble [120 × 10]> <tibble [0 × 3]>
##  3 <split [595/64]> Fold10      3 <tibble [120 × 10]> <tibble [0 × 3]>
##  4 <split [595/64]> Fold06      4 <tibble [20 × 10]>  <tibble [0 × 3]>
##  5 <split [595/64]> Fold08      5 <tibble [8 × 10]>   <tibble [0 × 3]>
##  6 <split [591/68]> Fold01      6 <tibble [6 × 10]>   <tibble [0 × 3]>
##  7 <split [591/68]> Fold04      7 <tibble [6 × 10]>   <tibble [0 × 3]>
##  8 <split [591/68]> Fold02      8 <tibble [6 × 10]>   <tibble [0 × 3]>
##  9 <split [595/64]> Fold09      9 <tibble [6 × 10]>   <tibble [0 × 3]>
## 10 <split [595/64]> Fold07     10 <tibble [6 × 10]>   <tibble [0 × 3]>
plot_race(tune_xgb_ft)

iterative tuning

During the search process, predict which values to test next. These methods can be used in a second time after a classical tune_grid to optimise the hyperparameters

Bayesian optimization

# # ## set initial values
# # set.seed(345)
# # tune_xgb_initial <- tune_grid(
# #   xgb_wf,
# #   resamples = folds
# # )
# 
# # bayesian optimisation
# set.seed(1403)
# xgb_bo <-
#   xgb_wf %>%
#   tune_bayes(
#     resamples = folds,
#     initial = tune_xgb,# ne foncitnone pas apres un tune race
#     iter = 25,
#     control = control_bayes(verbose = TRUE)
#   )
# # reuslt
# show_best(xgb_bo)
# 
# # visualisaiton of perfs
# autoplot(xgb_bo, type = "performance")

Simulated annealing

The process of using simulated annealing starts with an initial value and embarks on a controlled random walk through the parameter space. Each new candidate parameter value is a small perturbation of the previous value that keeps the new point within a local neighborhood.

# ## set initial values
# set.seed(345)
# tune_xgb_initial <- tune_grid(
#   xgb_wf,
#   resamples = folds
# )

# simulated annealing optimisation (library(finetune))

# Simulated annealing refinement (finetune), seeded with the grid-search
# results; stops early after 10 iterations without improvement.
set.seed(1404)
xgb_sa <- tune_sim_anneal(
  xgb_wf,
  resamples = folds,
  initial = tune_xgb,
  iter = 30,
  control = control_sim_anneal(verbose = TRUE, no_improve = 10L)
)

# performance across iterations
autoplot(xgb_sa, type = "performance")

# hyperparameter values explored
autoplot(xgb_sa, type = "parameters")

Selection of the best model

#visualisation des meilleures combinaisons
show_best(xgb_sa)
## # A tibble: 5 × 13
##    mtry trees min_n tree_depth learn_rate sample_size .metric .estimator  mean
##   <int> <int> <int>      <int>      <dbl>       <dbl> <chr>   <chr>      <dbl>
## 1     6  1126     4          8     0.0437       0.232 rmse    standard    19.3
## 2     5  1234     6          8     0.0705       0.319 rmse    standard    19.4
## 3     6  1186     6          9     0.0580       0.252 rmse    standard    19.6
## 4     7  1243     4          9     0.0484       0.308 rmse    standard    19.7
## 5     7  1002     2          7     0.0271       0.216 rmse    standard    19.7
## # ℹ 4 more variables: n <int>, std_err <dbl>, .config <chr>, .iter <int>
# choose the best model (lowest RMSE by default)
best_rmse_xgb <- select_best(xgb_sa)

# inject the selected hyperparameters into the model specification
final_xgb <- finalize_model(xgb_spec, best_rmse_xgb)

final_xgb
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = 6
##   trees = 1126
##   min_n = 4
##   tree_depth = 8
##   learn_rate = 0.0437252119228921
##   sample_size = 0.231523068694691
## 
## Computational engine: xgboost

finalise workflow

# finalised workflow, fitted on the full training set
final_wf_xgb <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(final_xgb) %>%
  fit(isavuco_ml_train)

# unfitted copy, kept for a later last_fit()
final_wf_xgb_non_fit <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(final_xgb)

importance plot

Allows evaluation of which variables are of interest and of their weights

library(vip)
# variable importance of the fitted boosted-tree model
xgb_fit <- extract_fit_parsnip(final_wf_xgb)
vip(xgb_fit, num_features = 10, geom = "point") + theme_bw()

crossvalidation

## 10-fold cross-validation of the finalised workflow
xgb_rs <- fit_resamples(
  object = final_wf_xgb,
  resamples = folds_cv,
  control = control_resamples(verbose = TRUE, save_pred = TRUE,
                              save_workflow = TRUE)
)

## resampled performance metrics
collect_metrics(xgb_rs)
## # A tibble: 2 × 6
##   .metric .estimator   mean     n std_err .config             
##   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   20.8      10  1.86   Preprocessor1_Model1
## 2 rsq     standard    0.920    10  0.0133 Preprocessor1_Model1
# per-prediction relative bias and its square (used for relative RMSE)
pred_obs_xgb <- collect_predictions(xgb_rs) %>%
  mutate(
    bias_rel = (.pred - auc_024) / auc_024,
    bias_rel_square = bias_rel^2
  )
# write.csv2(pred_obs_xgb, file = "pred_obs_xgb_tac_c0h.csv")

# relative bias & relative RMSE
pred_obs_xgb %>%
  summarise(
    biais_rel = mean(bias_rel),
    relative_rmse = sqrt(mean(bias_rel_square))
  )
## # A tibble: 1 × 2
##   biais_rel relative_rmse
##       <dbl>         <dbl>
## 1    0.0270         0.185
# observed vs resampled predictions
xgb_rs %>%
  collect_predictions() %>%
  ggplot(aes(x = auc_024, y = .pred)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Scatter plot ipred/DV",
       x = "True AUC isavuconazole", y = "predicted auc") +
  theme_bw()

# or, equivalently, a calibration plot
probably::cal_plot_regression(pred_obs_xgb, .pred, truth = auc_024)

Fit and save wf for future use

## fitted workflow, required to generate predictions from the object
fit_workflow <- fit(final_wf_xgb, isavuco_ml_train)

# example: predictions for the LAST 3 training subjects (slice_tail)
augment(fit_workflow, slice_tail(isavuco_ml_train, n = 3))
## # A tibble: 3 × 11
##   .pred id     conc_0 conc_1 dose_group weight   age  dose auc_024 diff_conc
##   <dbl> <fct>   <dbl>  <dbl> <fct>       <dbl> <dbl> <dbl>   <dbl>     <dbl>
## 1  174. 289_15  15.5    20.7 15mg/kg      49.9 12.4   748.    174.      5.27
## 2  250. 290_15   9.65   10.9 15mg/kg      45.4  9.12  682.    252.      1.23
## 3  187. 295_15   8.76   10.2 15mg/kg      42.5  9.71  637.    186.      1.43
## # ℹ 1 more variable: ratio_conc <dbl>
# save for later reuse, with a date-stamped file name
saveRDS(fit_workflow, file = str_c("xgboost_workshop_save_", today(), ".rds"))

Example of prediction in a new patient

## prediction for a hand-built new patient
#dput(names(isavuco_ml_train))
# NOTE(review): dose_group = "5mgkg" does not match the training levels
# ("5mg/kg", "10mg/kg", "15mg/kg"); harmless here only because the recipe
# removes dose_group (step_rm) — confirm before reusing this example.
# auc_024 = 38 is presumably a placeholder for the outcome column; the
# fitted model does not use it when predicting — verify.
nouveau_patient <- tibble(id = "toto", conc_0 = 2.8, conc_1 = 5.8,  dose_group = "5mgkg", weight = 20, 
age = 5, dose = 100,  auc_024 =  38, diff_conc = conc_1 - conc_0, ratio_conc = conc_1/conc_0)
predict(fit_workflow, nouveau_patient)
## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1  39.0

linear model for benchmarking

model & workflow

https://parsnip.tidymodels.org/reference/linear_reg

  • penalty A non-negative number representing the total amount of regularization (specific engines only).

  • mixture A number between zero and one (inclusive) denoting the proportion of L1 regularization (i.e. lasso) in the model.

mixture = 1 specifies a pure lasso model,

mixture = 0 specifies a ridge regression model, and

0 < mixture < 1 specifies an elastic net model, interpolating lasso and ridge.

Here Mixture = 1 allows to have a LASSO penalisation while mixture = 0 is the ridge penalisation

# LASSO linear model (mixture = 1): only the penalty is tuned.
lm_spec <- linear_reg(
  mode = "regression",
  penalty = tune(),
  mixture = 1
) %>%
  set_engine("glmnet")

# workflow = recipe + model
lm_wf <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(lm_spec)

# For an unpenalised linear model without preprocessing, add_formula(auc ~ .)
# could be used instead of add_recipe().

#tuning

# restrict the penalty search to 10^-3 .. 10^0 (log10 scale)
narrower_penalty <- penalty(range = c(-3, 0))

set.seed(345)
tune_lm <- tune_grid(
  lm_wf,
  resamples = folds,
  grid = 10,
  param_info = parameters(narrower_penalty),
  control = control_grid(verbose = TRUE, save_pred = TRUE,
                         save_workflow = TRUE)
)

#visualisatin results

# RMSE across the tried penalty values
autoplot(tune_lm, metric = "rmse", scientific = FALSE) +
  ggtitle("tuning hyperparameter")

# best model (lowest RMSE by default); an alternative is to keep the
# simplest model within one standard error of the best, e.g.
#   tune_lm %>% select_by_one_std_err(-penalty, metric = "rmse")
best_penalty <- tune_lm %>% select_best()

best_penalty
## # A tibble: 1 × 2
##   penalty .config              
##     <dbl> <chr>                
## 1 0.00143 Preprocessor1_Model01
# inject the selected penalty into the model specification
final_lm <- finalize_model(lm_spec, best_penalty)

final_lm
## Linear Regression Model Specification (regression)
## 
## Main Arguments:
##   penalty = 0.00142539757010873
##   mixture = 1
## 
## Computational engine: glmnet
# finalised workflow, fitted on the full training set
final_wf_lm <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(final_lm) %>%
  fit(isavuco_ml_train)

# importance plot (based on the glmnet coefficients)
lm_fit <- extract_fit_parsnip(final_wf_lm)
vip(lm_fit, geom = "point", num_features = 20) + theme_bw()

### resampled performance #####
set.seed(123)
lm_rs <- fit_resamples(
  object = final_wf_lm,
  resamples = folds_cv,
  control = control_resamples(verbose = TRUE, save_pred = TRUE,
                              save_workflow = TRUE)
)

## resampled performance metrics
collect_metrics(lm_rs)
## # A tibble: 2 × 6
##   .metric .estimator   mean     n std_err .config             
##   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   43.6      10  1.63   Preprocessor1_Model1
## 2 rsq     standard    0.674    10  0.0184 Preprocessor1_Model1
# per-prediction relative bias and its square (used for relative RMSE)
pred_obs_lm <- collect_predictions(lm_rs) %>%
  mutate(
    bias_rel = (.pred - auc_024) / auc_024,
    bias_rel_square = bias_rel^2
  )

# relative bias & relative RMSE
pred_obs_lm %>%
  summarise(
    biais_rel = mean(bias_rel),
    relative_rmse = sqrt(mean(bias_rel_square))
  )
## # A tibble: 1 × 2
##   biais_rel relative_rmse
##       <dbl>         <dbl>
## 1    0.0451         0.483
# Observed vs predicted. Fix: .pred is mapped to the x axis here, but the
# labels previously said x = "True AUC" and y = "predicted auc" — the axis
# labels were swapped relative to the aesthetics.
lm_rs %>%
  collect_predictions() %>%
  ggplot(aes(x = .pred, y = auc_024)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Scatter plot ipred/DV",
       x = "Predicted AUC", y = "True AUC isavuconazole") +
  theme_bw()

## fit and save the workflow (date-stamped)
fit_workflow_lm <- fit(final_wf_lm, isavuco_ml_train)
saveRDS(fit_workflow_lm, file = str_c("wf_reg_lm_workshop_save_", today(), ".rds"))

MARS model for benchmarking

model & workflow

Multivariate Adaptive Regression Splines, or MARS, is an algorithm for complex non-linear regression problems. The MARS algorithm involves discovering a set of simple piecewise linear functions that characterize the data and using them in aggregate to make a prediction. In a sense, the model is an ensemble of linear functions. Each function is piecewise linear, with a knot at the value t. In the terminology of […], these are linear splines. How was the cut point determined? Each data point for each predictor is evaluated as a candidate cut point by creating a linear regression model with the candidate features, and the corresponding model error is calculated. Once the full set of features has been created, the algorithm sequentially removes individual features that do not contribute significantly to the model equation. This “pruning” procedure assesses each predictor variable and estimates how much the error rate was decreased by including it in the model.

https://parsnip.tidymodels.org/reference/details_mars_earth.html

  • num_terms The number of features that will be retained in the final model, including the intercept.

  • prod_degree The highest possible interaction degree.

# MARS model: number of retained terms and interaction degree are tuned.
mars_spec <- mars(
  mode = "regression",
  num_terms = tune(),
  prod_degree = tune()
) %>%
  set_engine("earth")

# workflow = recipe + model
mars_wf <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(mars_spec)

# tuning over a 20-point grid
set.seed(345)
tune_mars <- tune_grid(
  mars_wf,
  resamples = folds,
  grid = 20,
  control = control_grid(verbose = TRUE, save_pred = TRUE,
                         save_workflow = TRUE)
)

# RMSE across the tried hyperparameter combinations
autoplot(tune_mars, metric = "rmse", scientific = FALSE) +
  ggtitle("tuning hyperparameter")

# best model (lowest RMSE by default)
best_rmse_mars <- select_best(tune_mars)

# inject the chosen hyperparameters into the specification
final_mars <- finalize_model(mars_spec, best_rmse_mars)

final_mars
## MARS Model Specification (regression)
## 
## Main Arguments:
##   num_terms = 5
##   prod_degree = 1
## 
## Computational engine: earth
# finalised workflow, fitted on the full training set
final_wf_mars <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(final_mars) %>%
  fit(isavuco_ml_train)

# importance plot
mars_fit <- extract_fit_parsnip(final_wf_mars)
vip(mars_fit, geom = "point", num_features = 20) + theme_bw()

### resampled performance #####
set.seed(123)
mars_rs <- fit_resamples(
  object = final_wf_mars,
  resamples = folds_cv,
  control = control_resamples(verbose = TRUE, save_pred = TRUE,
                              save_workflow = TRUE)
)

## resampled performance metrics
collect_metrics(mars_rs)
## # A tibble: 2 × 6
##   .metric .estimator   mean     n std_err .config             
##   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   38.4      10  1.69   Preprocessor1_Model1
## 2 rsq     standard    0.742    10  0.0196 Preprocessor1_Model1
# per-prediction relative bias and its square (used for relative RMSE)
pred_obs_mars <- collect_predictions(mars_rs) %>%
  mutate(
    bias_rel = (.pred - auc_024) / auc_024,
    bias_rel_square = bias_rel^2
  )

# relative bias & relative RMSE
pred_obs_mars %>%
  summarise(
    biais_rel = mean(bias_rel),
    relative_rmse = sqrt(mean(bias_rel_square))
  )
## # A tibble: 1 × 2
##   biais_rel relative_rmse
##       <dbl>         <dbl>
## 1    0.0847         0.411
# Observed vs predicted. Fix: .pred is mapped to the x axis, but the labels
# previously mislabelled the axes (swapped relative to the aesthetics).
mars_rs %>%
  collect_predictions() %>%
  ggplot(aes(x = .pred, y = auc_024)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Scatter plot ipred/DV",
       x = "Predicted AUC", y = "True AUC isavuconazole") +
  theme_bw()

## fit and save the workflow (fix: "mqrs" typo in the file name -> "mars")
fit_workflow_mars <- fit(final_wf_mars, isavuco_ml_train)
saveRDS(fit_workflow_mars, file = str_c("wf_reg_mars_workshop_save_", today(), ".rds"))

SVM polynomial model for benchmarking

SVM works by mapping data to a high-dimensional feature space so that data points can be categorized, even when the data are not otherwise linearly separable. A separator between the categories is found, then the data are transformed in such a way that the separator could be drawn as a hyperplane. Following this, characteristics of new data can be used to predict the group to which a new record should belong.

model & workflow

https://parsnip.tidymodels.org/reference/svm_poly.html

  • cost A positive number for the cost of predicting a sample within or on the wrong side of the margin

  • degree A positive number for polynomial degree.

  • scale_factor A positive number for the polynomial scaling factor.

  • margin A positive number for the epsilon in the SVM insensitive loss function (regression only)

# Polynomial SVM: cost, degree and epsilon margin are tuned.
# NOTE(review): importance = "permutation" is forwarded to kernlab::ksvm,
# which has no such argument — verify it is not silently ignored.
svm_spec <- svm_poly(
  mode = "regression",
  cost = tune(),
  degree = tune(),
  margin = tune()
) %>%
  set_engine("kernlab", importance = "permutation")

# workflow = recipe + model
svm_wf <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(svm_spec)


# tuning over a 30-point grid
set.seed(345)
tune_svm <- tune_grid(
  svm_wf,
  resamples = folds,
  grid = 30,
  control = control_grid(verbose = TRUE, save_pred = TRUE,
                         save_workflow = TRUE)
)

# RMSE across the tried hyperparameter combinations
autoplot(tune_svm, metric = "rmse", scientific = FALSE) +
  ggtitle("tuning hyperparameter")

# best model (lowest RMSE by default)
best_rmse_svm <- select_best(tune_svm)

final_svm <- finalize_model(
  svm_spec,
  best_rmse_svm
)

final_svm
## Polynomial Support Vector Machine Model Specification (regression)
## 
## Main Arguments:
##   cost = 0.0173753879645583
##   degree = 3
##   margin = 0.0389190546469763
## 
## Engine-Specific Arguments:
##   importance = permutation
## 
## Computational engine: kernlab
# Finalize the workflow with the tuned hyperparameters and fit it once on the
# full training set.
final_wf_svm <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(final_svm) %>% 
  fit(isavuco_ml_train)

# Extract the fitted parsnip model (kept for symmetry with the other models).
svm_fit <- extract_fit_parsnip(final_wf_svm)
# No vip() importance plot is available for kernlab SVMs.
#vip(svm_fit, geom = "point", num_features = 20) + theme_bw()

# Cross-validated performance of the finalized workflow on `folds_cv`.
# NOTE(review): `final_wf_svm` is already a *fitted* workflow; fit_resamples()
# refits on each resample anyway, but passing the unfitted finalized workflow
# would be cleaner -- confirm intent.
set.seed(123)
svm_rs<- fit_resamples(object = final_wf_svm, resamples = folds_cv, control = control_resamples(verbose=TRUE, save_pred = TRUE, save_workflow = TRUE))

##perf resample
svm_rs %>% collect_metrics()
## # A tibble: 2 × 6
##   .metric .estimator   mean     n std_err .config             
##   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   42.3      10  4.78   Preprocessor1_Model1
## 2 rsq     standard    0.732    10  0.0359 Preprocessor1_Model1
# Out-of-fold predictions with per-observation relative bias and its square.
pred_obs_svm <- svm_rs %>%
  collect_predictions() %>%
  mutate(
    bias_rel = (.pred - auc_024) / auc_024,
    bias_rel_square = bias_rel^2
  )
# write.csv2(pred_obs_xgb, file = "pred_obs_xgb_tac_c0h.csv")

# Mean relative bias and relative RMSE over the resample predictions.
pred_obs_svm %>%
  summarise(
    biais_rel = mean(bias_rel),
    relative_rmse = sqrt(mean(bias_rel_square))
  )
## # A tibble: 1 × 2
##   biais_rel relative_rmse
##       <dbl>         <dbl>
## 1   -0.0812          1.35
# Observed vs predicted scatter plot for the SVM resamples.
# FIX: the aesthetics were swapped relative to the axis labels (.pred was on
# x but labelled "True AUC isavuconazole"); predictions now go on y, matching
# the MLP plot.
svm_rs %>%
  collect_predictions() %>%
  ggplot(mapping = aes(x = auc_024, y = .pred)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Scatter plot ipred/DV", x = "True AUC isavuconazole", y = "predicted auc") +
  theme_bw()

# Fit the finalized SVM workflow on the full training set and persist it.
fit_workflow_svm <- fit(final_wf_svm, isavuco_ml_train)
saveRDS(fit_workflow_svm, file = str_c("wf_reg_svm_workshop_save_", today(), ".rds"))

MLP : Multilayer perceptron via brulee

model & workflow

Tuning Parameters This model has 6 tuning parameters:

hidden_units: # Hidden Units (type: integer, default: 3L)

penalty: Amount of Regularization (type: double, default: 0.0)

epochs: # Epochs (type: integer, default: 100L)

dropout: Dropout Rate (type: double, default: 0.0)

learn_rate: Learning Rate (type: double, default: 0.01)

activation: Activation Function (type: character, default: ‘relu’)

# Single-layer MLP (brulee engine); hidden_units, dropout and learn_rate are
# tuned.
# FIX: dropped `importance = "permutation"` from set_engine() -- that is a
# ranger engine argument (copy-paste); brulee has no such engine option.
mlp_spec <- mlp(mode = "regression",
                hidden_units = tune(),
                dropout = tune(),
                learn_rate = tune()) %>%
  set_engine("brulee")


# Workflow = preprocessing recipe + model specification.
mlp_wf <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(mlp_spec)
#

library(doParallel)
# Step 1: set up the parallel backend.
# FIX: guard the worker count -- detectCores() can return NA, and on a
# single-core machine `detectCores() - 1` is 0; either previously made
# makeCluster() fail.
num_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(num_cores)             # Create a cluster with the specified number of cores
registerDoParallel(cl)                   # Register the cluster for parallel processing

# Inform the user about the parallel setup
message("Parallel backend set up with ", num_cores, " cores.")

# Step 2: Define the tuning process with parallel control
set.seed(234)  # Set seed for reproducibility

tune_mlp <- tune_grid(
  mlp_wf,
  resamples = folds,
  grid = 30, 
  control = control_grid(
    allow_par = TRUE,              # Enable parallel processing
    parallel_over = "everything",  # Parallelise across resamples and grid combinations
    verbose = TRUE, 
    save_pred = TRUE, 
    save_workflow = TRUE
  )
)

# Step 3: Stop the cluster after tuning
stopCluster(cl)          # Shut down the parallel cluster
registerDoSEQ()          # Revert to sequential execution
message("Parallel backend stopped and reverted to sequential execution.")


# Visualise tuning results: RMSE across the candidate hyperparameter sets.
autoplot(tune_mlp, metric = "rmse",  scientific = FALSE) +
  ggtitle("tuning hyperparameter")

# Select the best candidate.
# FIX: pass `metric = "rmse"` explicitly (select_best() otherwise uses the
# default first metric and warns in recent tune versions).
best_rmse_mlp <- select_best(tune_mlp, metric = "rmse")

# Inject the winning hyperparameters into the specification.
final_mlp <- finalize_model(
  mlp_spec,
  best_rmse_mlp
)

final_mlp
## Single Layer Neural Network Model Specification (regression)
## 
## Main Arguments:
##   hidden_units = 6
##   dropout = 0.102566930372268
##   learn_rate = 0.126177204360878
## 
## Engine-Specific Arguments:
##   importance = permutation
## 
## Computational engine: brulee
# Finalize the workflow with the tuned hyperparameters and fit it once on the
# full training set.
final_wf_mlp <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(final_mlp) %>% 
  fit(isavuco_ml_train)

# Extract the fitted parsnip model; no vip() importance plot is drawn for the
# MLP here.
mlp_fit <- extract_fit_parsnip(final_wf_mlp)
#vip(mlp_fit, geom = "point", num_features = 20) + theme_bw()

# Cross-validated performance of the finalized MLP workflow on `folds_cv`.
set.seed(123)
mlp_rs<- fit_resamples(object = final_wf_mlp, resamples = folds_cv, control = control_resamples(verbose=TRUE, save_pred = TRUE, save_workflow = TRUE))

##perf resample
mlp_rs %>% collect_metrics()
## # A tibble: 2 × 6
##   .metric .estimator   mean     n std_err .config             
##   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   39.5      10  1.90   Preprocessor1_Model1
## 2 rsq     standard    0.734    10  0.0226 Preprocessor1_Model1
# Out-of-fold predictions with per-observation relative bias and its square.
pred_obs_mlp <- mlp_rs %>%
  collect_predictions() %>%
  mutate(
    bias_rel = (.pred - auc_024) / auc_024,
    bias_rel_square = bias_rel^2
  )
# write.csv2(pred_obs_xgb, file = "pred_obs_xgb_tac_c0h.csv")

# Mean relative bias and relative RMSE over the resample predictions.
pred_obs_mlp %>%
  summarise(
    biais_rel = mean(bias_rel),
    relative_rmse = sqrt(mean(bias_rel_square))
  )
## # A tibble: 1 × 2
##   biais_rel relative_rmse
##       <dbl>         <dbl>
## 1    0.0924         0.371
# Observed vs predicted scatter plot for the MLP resamples
# (predictions on y, reference AUC on x, matching the axis labels).
mlp_rs %>%
  collect_predictions() %>%
  ggplot(mapping = aes(x = auc_024, y = .pred)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Scatter plot ipred/DV", x = "True AUC isavuconazole", y = "predicted auc") +
  theme_bw()

# Fit the finalized MLP workflow on the full training set and persist it.
fit_workflow_mlp <- fit(final_wf_mlp, isavuco_ml_train)
saveRDS(fit_workflow_mlp, file = str_c("wf_reg_mlp_workshop_save_", today(), ".rds"))

RF : Random Forest

model & workflow

Tuning Parameters This model has 3 tuning parameters:

mtry: # Randomly Selected Predictors (type: integer, default: see below)

trees: # Trees (type: integer, default: 500L)

min_n: Minimal Node Size (type: integer, default: see below)

# Random-forest specification (ranger engine): mtry and min_n are tuned,
# trees fixed at 1000. `importance = "permutation"` is a valid ranger engine
# argument and enables the vip() plot drawn later.
rf_spec <- rand_forest(mode = "regression",
                        mtry = tune(),
                        trees = 1000,
                        min_n = tune()
                        ) %>% set_engine("ranger", importance = "permutation")


# Workflow = preprocessing recipe + model specification.
rf_wf<- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(rf_spec)
#


# Hyperparameter tuning: 20 grid candidates evaluated on the `folds`
# resamples; predictions and workflows kept for stacking.
set.seed(345)

tune_rf <- tune_grid(
  rf_wf,
  resamples = folds,
  grid = 20, 
  control = control_grid(
    verbose = TRUE, save_pred = TRUE, save_workflow = TRUE
  )
)



# Visualise tuning results: RMSE across the candidate hyperparameter sets.
autoplot(tune_rf, metric = "rmse",  scientific = FALSE) +
  ggtitle("tuning hyperparameter")

# Select the best candidate.
# FIX: pass `metric = "rmse"` explicitly (select_best() otherwise uses the
# default first metric and warns in recent tune versions).
best_rmse_rf <- select_best(tune_rf, metric = "rmse")

# Inject the winning hyperparameters into the specification.
final_rf <- finalize_model(
  rf_spec,
  best_rmse_rf
)

final_rf
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = 4
##   trees = 1000
##   min_n = 8
## 
## Engine-Specific Arguments:
##   importance = permutation
## 
## Computational engine: ranger
# Finalize the workflow with the tuned hyperparameters and fit it once on the
# full training set.
final_wf_rf <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(final_rf) %>% 
  fit(isavuco_ml_train)

# Permutation-importance plot -- available because the ranger engine was set
# up with importance = "permutation".
rf_fit <- extract_fit_parsnip(final_wf_rf)
vip(rf_fit, geom = "point", num_features = 20) + theme_bw()

# Cross-validated performance of the finalized RF workflow on `folds_cv`.
set.seed(123)
rf_rs<- fit_resamples(object = final_wf_rf, resamples = folds_cv, control = control_resamples(verbose=TRUE, save_pred = TRUE, save_workflow = TRUE))

##perf resample
rf_rs %>% collect_metrics()
## # A tibble: 2 × 6
##   .metric .estimator   mean     n std_err .config             
##   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   24.8      10  2.09   Preprocessor1_Model1
## 2 rsq     standard    0.890    10  0.0196 Preprocessor1_Model1
# Out-of-fold predictions with per-observation relative bias and its square.
pred_obs_rf <- rf_rs %>%
  collect_predictions() %>%
  mutate(
    bias_rel = (.pred - auc_024) / auc_024,
    bias_rel_square = bias_rel^2
  )
# write.csv2(pred_obs_xgb, file = "pred_obs_xgb_tac_c0h.csv")

# Mean relative bias and relative RMSE over the resample predictions.
pred_obs_rf %>%
  summarise(
    biais_rel = mean(bias_rel),
    relative_rmse = sqrt(mean(bias_rel_square))
  )
## # A tibble: 1 × 2
##   biais_rel relative_rmse
##       <dbl>         <dbl>
## 1    0.0584         0.233
# Observed vs predicted scatter plot for the RF resamples.
# FIX: the aesthetics were swapped relative to the axis labels (.pred was on
# x but labelled "True AUC isavuconazole"); predictions now go on y, matching
# the MLP plot.
rf_rs %>%
  collect_predictions() %>%
  ggplot(mapping = aes(x = auc_024, y = .pred)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Scatter plot ipred/DV", x = "True AUC isavuconazole", y = "predicted auc") +
  theme_bw()

# Fit the finalized RF workflow on the full training set and persist it.
fit_workflow_rf <- fit(final_wf_rf, isavuco_ml_train)
saveRDS(fit_workflow_rf, file = str_c("wf_reg_rf_workshop_save_", today(), ".rds"))

Benchmark other models

mlp and random forest

https://www.tidymodels.org/find/parsnip/

For example random_forest: https://parsnip.tidymodels.org/reference/rand_forest.html Hyperparameters to tune * mtry An integer for the number of predictors that will be randomly sampled at each split when creating the tree models.

validation test

Use of Xgb

# Final evaluation: last_fit() trains the best-performing workflow on the
# training split of `isavuco_split` and evaluates once on its held-out test
# split.
final_res <- final_wf_xgb_non_fit %>% # use the workflow with the best performance
  last_fit(isavuco_split)

# final_res <- final_wf_xgb %>% #mettre wf meilleures perf
#   augment(isavuco_ml_test)

# Test-set performance (bias / imprecision).
final_res %>% collect_metrics()
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      26.1   Preprocessor1_Model1
## 2 rsq     standard       0.897 Preprocessor1_Model1
# Test-set predictions with per-observation relative bias and its square.
final_res_predictions <- final_res %>% collect_predictions() %>%
  rename(AUC_pred = .pred) %>%
  mutate (bias_rel = (AUC_pred - auc_024)/auc_024,
          bias_rel_square = bias_rel * bias_rel)


# Summary table: relative bias, relative RMSE, and the share/count of test
# predictions deviating more than 20% from the reference AUC.
rmarkdown::paged_table(as.data.frame(
  final_res_predictions %>%
    summarise(
      biais_rel = mean(bias_rel),
      relative_rmse = sqrt(mean(bias_rel_square)),
      biais_out_20percent = mean(!between(bias_rel, -0.2, 0.2)),
      nb_out_20percent = sum(!between(bias_rel, -0.2, 0.2)),
      n = n()
    )
))
# Predicted vs reference AUC scatter plot on the test set.
final_res_predictions %>%
  ggplot(mapping = aes(x = auc_024, y = AUC_pred)) +
  geom_point() +
  geom_smooth(method = lm, color = "black") +
  labs(
    y = "Predicted AUC (mg*h/L)",
    x = "Reference AUC (mg*h/L)"
  ) +
  theme_bw()

# Residual (prediction minus reference) vs predicted AUC.
# FIX: the x aesthetic plots AUC_pred, so the axis is labelled
# "Predicted AUC", not "Reference AUC" as before.
final_res_predictions %>%
  ggplot(mapping = aes(x = AUC_pred, y = AUC_pred - auc_024)) +
  geom_point() +
  geom_smooth(method = lm, color = "black") +
  labs(
    y = "Bias AUC (mg*h/L)",
    x = "Predicted AUC (mg*h/L)"
  ) +
  theme_bw()

# Bland-Altman statistics comparing predicted and reference AUC on the test
# set.
statistics.results <- blandr.statistics( final_res_predictions$AUC_pred , final_res_predictions$auc_024 )

# Bland-Altman plot without CI display or shading.
# FIX: `ciDisplay = F` -> `FALSE` (T/F are reassignable shorthands).
blandr.plot.ggplot( statistics.results , plotTitle = "" , ciDisplay = FALSE, ciShading = FALSE) + theme_bw()

VIP plot test set

# Permutation importance of the final XGB model fitted by last_fit().
xgb_fit <- extract_fit_parsnip(final_res)
vip(xgb_fit, geom = "point", num_features = 10) + theme_bw()

Shapvalues

#
library(shapviz)

# Rebuild the preprocessing recipe used by the XGB workflow so the test set
# can be baked into the matrix form that shapviz/xgboost expects.
isavuco_ml_rec  <- recipe(auc_024 ~ ., data = isavuco_ml_train) %>%
  update_role(id, new_role = "id") %>%
  step_rm(dose_group) %>%
  step_log(contains("conc"), offset = 0.0001) %>%
  step_YeoJohnson(dose) %>%
  step_normalize(all_numeric_predictors())
# With categorical covariates, one-hot encode via step_dummy (slows rf/xgb --
# keep factors for those models); collapse rare levels with step_other.

isavuco_ml_rec_prep <-  prep(isavuco_ml_rec )
isavuco_train_recipe <-bake(isavuco_ml_rec_prep, new_data = NULL)
isavuco_test_recipe <-bake(isavuco_ml_rec_prep, new_data = isavuco_ml_test)

# Predictor matrix for SHAP computation.
# FIX: reuse the already-prepped recipe instead of prep()-ing the same
# recipe a second time (identical result, wasted work).
xgb_test_recipe_shap <- bake(
  isavuco_ml_rec_prep,
  has_role("predictor"),
  new_data = isavuco_ml_test,
  composition = "matrix"
)


# shapviz object: SHAP values from the xgboost booster on the baked test
# matrix, with the baked test data frame attached for display.
shap <- shapviz(extract_fit_engine(final_wf_xgb), X_pred = xgb_test_recipe_shap, X = isavuco_test_recipe)

# script to change or Translate the variable names
# translations <- c(
#   "Dose_matin" = "Morning_dose",
#   "age" = "age",
#   "Type_greffe_rein" = "Organ_transplanted_kidney",
#     "Type_greffe_moelle" = "Organ_transplanted_HSCT",
#     "Type_greffe_cardiac" = "Organ_transplanted_heart",
#       "Type_greffe_foie" = "Organ_transplanted_liver",
#       "Type_greffe_poumon" = "Organ_transplanted_lung",
#         "Type_greffe_autre" = "Organ_transplanted_other",
#   "Delai_post_greffe" = "Time_post_transplantation"
 #)

# # Rename columns of shap$X (data frame)
# if (!is.null(shap$X) && is.data.frame(shap$X)) {
#   colnames(shap$X) <- map_chr(
#     colnames(shap$X),
#     ~ ifelse(.x %in% names(translations), translations[.x], .x)
#   )
# }
#
# # Rename columns of shap$S (SHAP matrix)
# if (!is.null(shap$S) && is.matrix(shap$S)) {
#   colnames(shap$S) <- map_chr(
#     colnames(shap$S),
#     ~ ifelse(.x %in% names(translations), translations[.x], .x)
#   )
# }

# Check results
# print(colnames(shap$X))
# print(colnames(shap$S))

# SHAP importance: combined bar + beeswarm display, values annotated.
shap_imp <- sv_importance(
  shap,
  kind = "both",
  show_numbers = TRUE,
  max_display = 40L
) +
  ggtitle("SHAP Importance")

# Display the plot.
print(shap_imp)

dependence plots

# SHAP dependence of the prediction on conc_0, coloured by dose.
dep_plot <- sv_dependence(shap, "conc_0", color_var = "dose") +
  ggtitle("Dependence Plot")

# Display the plot.
print(dep_plot)

individual force plot transformed scale

# SHAP force plot for one test observation (row 80).
force_plot <- sv_force(shap, row_id = 80) +
  ggtitle("Force Plot")

# Display the plot.
print(force_plot)

# Waterfall plot for the first observation.
# FIX: title said "Force Plot" (copy-paste); this is a waterfall plot.
sv_waterfall(shap, row_id = 1) +
  ggtitle("Waterfall Plot")

plot in the original scale: reverse transform

# Recompute SHAP values for the transformed-scale test matrix.
shap_values <- shapviz(extract_fit_engine(final_wf_xgb), 
                       X_pred = xgb_test_recipe_shap, 
                       X = isavuco_test_recipe)

# Locate the normalization step in the prepped recipe to recover the
# centring/scaling parameters.
# FIX: vapply()+inherits() instead of sapply()+`%in% class(x)` (type-stable),
# and take the first match explicitly so duplicate steps cannot break `[[`.
is_norm <- vapply(
  isavuco_ml_rec_prep$steps,
  function(s) inherits(s, "step_normalize"),
  logical(1)
)
normalize_step <- isavuco_ml_rec_prep$steps[[which(is_norm)[1]]]
means <- normalize_step$means
sds <- normalize_step$sds

# Convert to tibble to ensure compatibility with column replacement below.
shap_values$X <- as_tibble(shap_values$X)

# Reverse normalization transformation
# Undo step_normalize(): x_original = x_scaled * sd + mean, applied to every
# column of `data` that has an entry in `means`/`sds`; all other columns pass
# through unchanged. Returns the modified data frame.
reverse_transform <- function(data, means, sds) {
  shared_cols <- intersect(names(means), colnames(data))
  for (nm in shared_cols) {
    data[[nm]] <- data[[nm]] * sds[nm] + means[nm]
  }
  data
}

# Undo the normalization on the displayed feature values.
shap_values$X <- reverse_transform(shap_values$X, means, sds)

# Undo step_log() on the concentration variables; the 0.0001 offset must
# match the `offset` used in the recipe's step_log().
log_vars <- grep("conc", colnames(shap_values$X), value = TRUE)
for (col in log_vars) {
  shap_values$X[[col]] <- exp(shap_values$X[[col]]) - 0.0001  # Reverse log transformation
}

# SHAP force plots on the back-transformed (original) scale for two test
# observations.
sv_force(shap_values, row_id = 200) + ggtitle("SHAP Force Plot (Original Scale)")

sv_force(shap_values, row_id = 80) + ggtitle("SHAP Force Plot (Original Scale)")

# Dependence plot on the original scale.
dep_plot <- sv_dependence(shap_values, "conc_0", color_var = "dose") +
  ggtitle("SHAP Dependence Plot (Original Scale)")

print(dep_plot)

Explainer

library(DALEXtra)
# DALEX explainer around the fitted workflow; `y` is the observed AUC.
# NOTE(review): `fit_workflow` is defined earlier in the document; the "xgb"
# label suggests it is the fitted XGB workflow -- confirm against that chunk.
explainer_external <- explain_tidymodels(
  model = fit_workflow, 
  data = isavuco_ml_train, 
  y = isavuco_ml_train$auc_024,
  label = "xgb")
## Preparation of a new explainer is initiated
##   -> model label       :  xgb 
##   -> data              :  659  rows  10  cols 
##   -> data              :  tibble converted into a data.frame 
##   -> target variable   :  659  values 
##   -> predict function  :  yhat.workflow  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package tidymodels , ver. 1.1.1 , task regression (  default  ) 
##   -> predicted values  :  numerical, min =  15.02153 , mean =  129.0577 , max =  387.1656  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -15.61081 , mean =  -0.02124672 , max =  21.86915  
##   A new explainer has been created!

Breakdown plot

https://github.com/pbiecek/breakDown & https://arxiv.org/abs/1804.01955

There are multiple possible approaches to understanding why a model predicts a given auc. One is a break-down explanation: it computes how contributions attributed to individual features change the mean model’s prediction for a particular observation.

# Break-down explanation for the first test observation: per-feature
# contributions that move the mean model prediction to this prediction.
bd_plot <- predict_parts(
  explainer = explainer_external,
  new_observation = slice_head(isavuco_ml_test, n = 1),
  type = "break_down"
)
plot(bd_plot, max_features = 5)

Partial dependence plot

Partial dependence profiles show how the expected value of a model prediction changes as a function of a feature.

# Partial dependence profile of the prediction as a function of conc_0.
pdp_diff_conc_1 <- model_profile(
  explainer_external,
  variables = "conc_0",
  N = NULL # number of training observations to use; NULL = all
)
plot(pdp_diff_conc_1)

Applicability domain

The approach used is a fairly simple unsupervised method that attempts to measure how much (if any) a new data point is beyond the training data. The idea is to accompany a prediction with a score that measures how similar the new point is to the training set. A percentile can be computed for new samples that reflect how much of the training set is less extreme than the new samples.

library(applicable)
# PCA-based applicability domain on the training predictors: components are
# retained until 95% of the predictor variation is captured.
pca_stat <- apd_pca(isavuco_ml_rec, data = isavuco_ml_train , threshold = 0.95)
pca_stat
## # Predictors:
##    8
## # Principal Components:
##    4 components were needed
##    to capture at least 95% of the
##    total variation in the predictors.
# Distribution of training-set distances in the PCA space.
autoplot(pca_stat, distance) + labs(x = "distance")

Compute percentile in new data

# Distance and distance percentile for one random test sample: how extreme it
# is relative to the training set.
score(pca_stat, isavuco_ml_test %>% slice_sample(n = 1)) %>%
  select(starts_with("distance")) %>%
  arrange(desc(distance_pctl))
## # A tibble: 1 × 2
##   distance distance_pctl
##      <dbl>         <dbl>
## 1     6.57          99.3

Ensemble models

# Data stack: assemble the saved out-of-fold predictions (tune_grid results)
# of every tuned model as candidate members.
isavuco_data_st <-
  stacks() %>%
  add_candidates(tune_lm) %>%
  add_candidates(tune_rf) %>%
  add_candidates(tune_mlp) %>%
  add_candidates(tune_xgb) %>%
  add_candidates(tune_mars) %>%
  add_candidates(tune_svm)

# Inspect the stacked candidate predictions.
as_tibble(isavuco_data_st)
## # A tibble: 659 × 153
##    auc_024 tune_lm_1_01 tune_lm_1_07 tune_lm_1_08 tune_lm_1_09 tune_lm_1_10
##      <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
##  1    69.1         69.0         69.0         69.1         69.9         70.8
##  2    64.1         89.0         89.0         89.2         90.0         91.1
##  3    43.4         69.5         69.5         69.7         70.8         72.2
##  4    46.3         71.8         71.9         72.1         73.4         75.1
##  5    68.3        114.         114.         114.         114.         114. 
##  6    43.2         43.4         43.4         43.6         44.6         45.9
##  7    55.9        102.         102.         102.         100.          98.1
##  8    54.6         48.2         48.2         48.4         49.6         51.1
##  9    53.1         14.7         14.7         14.9         15.9         17.2
## 10    57.0         49.3         49.3         49.5         50.8         52.5
## # ℹ 649 more rows
## # ℹ 147 more variables: tune_rf_1_08 <dbl>, tune_rf_1_07 <dbl>,
## #   tune_rf_1_17 <dbl>, tune_rf_1_06 <dbl>, tune_rf_1_01 <dbl>,
## #   tune_rf_1_14 <dbl>, tune_rf_1_11 <dbl>, tune_rf_1_05 <dbl>,
## #   tune_rf_1_19 <dbl>, tune_rf_1_13 <dbl>, tune_rf_1_09 <dbl>,
## #   tune_rf_1_10 <dbl>, tune_rf_1_02 <dbl>, tune_rf_1_12 <dbl>,
## #   tune_rf_1_03 <dbl>, tune_rf_1_18 <dbl>, tune_rf_1_20 <dbl>, …

Fit the stack

The blend_predictions function determines how member model output will ultimately be combined in the final prediction by fitting a LASSO model on the data stack, predicting the true assessment set outcome using the predictions from each of the candidate members. Candidates with nonzero stacking coefficients become members.

# Resolve the coef() name clash introduced by loading brulee.
conflicted::conflicts_prefer(brulee::coef)

# Blend: a LASSO meta-learner is fit on the candidate predictions; candidates
# with nonzero stacking coefficients become ensemble members.
set.seed(1234)
isavuco_model_st <-
  isavuco_data_st %>%
  blend_predictions()

Plots

This evaluates the meta-learning model over a predefined grid of lasso penalty values and uses an internal resampling method to determine the best value. The autoplot() method, shown in Figure 20.1, helps us understand if the default penalization method was sufficient:

autoplot(isavuco_model_st)

autoplot(isavuco_model_st, type = "members")

The autoplot() method can be used again to show the contributions of each model type,

autoplot(isavuco_model_st, type = "weights")

Fit

Model stacks can be thought of as a group of fitted member models and a set of instructions on how to combine their predictions.

# Fit each member with a nonzero stacking coefficient on the full training
# set so the stack can make predictions.
set.seed(1234)
isavuco_model_st_fitted <-
  isavuco_model_st %>%
  fit_members()

Make the predictions on the test set

using the stack

# Metric set for the final stack evaluation.
reg_metrics <- metric_set(rmse, rsq)

# Test-set predictions from the fitted stack appended to the test data.
stack_test_predictions <- augment(isavuco_model_st_fitted, isavuco_ml_test)

# RMSE and R-squared of the stack on the test set.
stack_test_predictions %>%
  reg_metrics(auc_024, .pred)
## # A tibble: 2 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      25.6  
## 2 rsq     standard       0.900
# Predicted vs reference AUC for the stacked model on the test set.
stack_test_predictions %>%
  ggplot(mapping = aes(x = auc_024, y = .pred)) +
  geom_point() +
  geom_smooth(method = lm, color = "black") +
  labs(
    y = "Predicted AUC (mg*h/L)",
    x = "Reference AUC (mg*h/L)"
  ) +
  theme_bw()

# ggplot(isavuco_ml_test) +
#   aes(x = auc_024, 
#       y = .pred) +
#   geom_point() + 
#   coord_obs_pred()

Comparison of the stack object and the individual member predictions (RMSE, for example)

# Predictions from each individual stack member alongside the observed AUC.
member_preds <- isavuco_ml_test %>%
  select(auc_024) %>%
  bind_cols(predict(isavuco_model_st_fitted, isavuco_ml_test, members = TRUE))

# RMSE of every member column against the observed AUC (truth column is
# excluded from the mapping).
rmse_results <- member_preds %>%
  select(-auc_024) %>%
  map_df(~ rmse_vec(truth = member_preds$auc_024, estimate = .x), .id = "model")

# Display the comparison table.
rmarkdown::paged_table(rmse_results)